Identifying Amino Acid Patterns via Phage Display + NGS + Machine Learning

2017/05/10 by Andrew Chang

In [6]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
In [7]:
Image('~/F1.png')
Out[7]:
In [8]:
Image('~/F2.png')
Out[8]:
In [9]:
Image('~/F3.png')
Out[9]:
In [12]:
Image('~/F4.png')
Out[12]:
In [13]:
Image('~/F5.png')
Out[13]:
In [14]:
Image('~/F6.png')
Out[14]:
In [29]:
Image('~/F7.png')
Out[29]:
In [25]:
Image('~/F8.png', width=500, height=500)
Out[25]:
In [30]:
df = pd.read_csv('~/demo_seq.csv')
In [36]:
# check the size of the data
print "size of df:", df.shape
size of df: (79483, 7)
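
A quick peek at the columns helps orient the reader (a sketch, not from the original run; var_seq holds the peptide string and Label the class, as used below):

In [ ]:
# sketch: inspect the columns of the demo data
print(df.columns.tolist())  # expect 'var_seq' and 'Label' among the 7 columns
print(df.head())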

Physical-Chemical Properties of Amino Acids

In [32]:
aaindex_7 = pd.read_csv('~/AAIndex_Andrew_choice.csv', header=0, index_col=0)
display(aaindex_7)
Average accessible surface area | Free energy of solution in water | Polarity | Isoelectric point | Molecular_Weight | Hydrophobicity | Rigidity_VHSE8
Amino Acid
A 0.141 0.262 0.000 0.404 0.436273 0.619355 0.175510
R 0.905 0.169 1.000 1.000 0.852960 0.264516 0.442857
N 0.510 0.313 0.065 0.330 0.646918 0.174194 0.110204
D 0.515 0.601 0.956 0.000 0.651765 0.000000 0.387755
C 0.000 0.947 0.028 0.285 0.593253 0.670968 0.234694
Q 0.608 0.416 0.068 0.360 0.715615 0.290323 0.189796
E 0.602 0.561 0.960 0.056 0.720413 0.154839 0.277551
G 0.103 0.240 0.000 0.401 0.367576 0.354839 0.000000
H 0.402 0.313 0.992 0.603 0.759732 0.406452 0.406122
I 0.083 0.424 0.003 0.407 0.642315 0.993548 0.246939
L 0.138 0.463 0.003 0.402 0.642315 0.980645 0.146939
K 1.000 0.313 0.952 0.872 0.715811 0.206452 0.300000
M 0.206 0.405 0.028 0.372 0.730598 0.832258 0.134694
F 0.114 0.462 0.007 0.339 0.808843 1.000000 0.232653
P 0.411 0.000 0.030 0.442 0.563727 0.058065 1.000000
S 0.303 0.240 0.032 0.364 0.514567 0.322581 0.295918
T 0.337 0.313 0.032 0.362 0.583264 0.438710 0.353061
W 0.219 0.537 0.040 0.390 1.000000 0.980645 0.100000
Y 0.454 1.000 0.031 0.362 0.887186 0.761290 0.167347
V 0.094 0.369 0.003 0.399 0.573618 0.845161 0.267347
O 0.000 0.000 0.000 0.000 0.000000 0.000000 0.000000
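
The values above appear to be min-max scaled to [0, 1] within each property column. A sketch of that normalization, assuming a hypothetical DataFrame raw_aaindex of unscaled AAIndex values:

In [ ]:
# hypothetical sketch: min-max scale each property column to [0, 1]
# raw_aaindex is an assumed, unscaled AAIndex DataFrame (not defined in this notebook)
normalized = (raw_aaindex - raw_aaindex.min()) / (raw_aaindex.max() - raw_aaindex.min())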

Protein Sequence Encoding

Convert the amino acid strings into numerical vectors so that machine learning algorithms can handle them.
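
For intuition, here is a minimal sketch (not from the original run) of the idea: flattening a hypothetical 3-residue peptide into 3*7 = 21 values using the aaindex_7 table above.

In [ ]:
# sketch: encode a short peptide with the aaindex_7 lookup table
demo_peptide = 'ARN'  # hypothetical example sequence
encoded = [aaindex_7.loc[aa, :].tolist() for aa in demo_peptide]  # 3 rows x 7 properties
flat = np.array(encoded).ravel()  # properties of A, then R, then N
print(flat.shape)  # (21,)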

In [33]:
def generate_seq_ft_lb(s_data_num):
    seqs_num = s_data_num.var_seq 

    # converting aa seq into number as shown in aa_label
    seq_feature = []
    for seq in seqs_num:
        string = pd.Series(list(seq))
        string.reset_index(drop=True, inplace=True) # reset the index for the next step
        for index, charc in enumerate(string):
            string[index] = aaindex_7.loc[charc,:].tolist()

        seq_feature.append(string)

    seq_feature = np.array(seq_feature)
    seq_label = s_data_num.Label.reset_index(drop=True)
    seq_label = np.array(seq_label).astype(int)

    num_total_sample = seq_feature.shape[0]
    num_aa = seq_feature.shape[1]
    num_chemprop = seq_feature.shape[2]
    
    print '%d samples. Each sample has %d features, with %d values per feature.' %(num_total_sample, num_aa, num_chemprop)
    print '%d labels.' %(seq_label.shape[0])
    #print "Feature of the original 1st data: \n", seq_feature[0]
    
    ## reshape so each sample has num_aa*num_chemprop = 41*7 = 287 features: features 0-6 are aa1, 7-13 are aa2, ...
    seq_feature = seq_feature.reshape(num_total_sample,num_aa*num_chemprop)
    print ""
    print 'The shape of transformed features', seq_feature.shape
    print "Feature of the original 1st data: \n", seq_feature[0]
    print '====================================================================\n'
    
    return(seq_feature, seq_label, num_aa, num_chemprop)
In [34]:
# feed in the data for encoding
seq_feature, seq_label, num_aa, num_chemprop = generate_seq_ft_lb(df)
79483 samples. Each sample has 41 features, with 7 values per feature.
79483 labels.

The shape of transformed features (79483, 287)
Feature of the original 1st data: 
[0.30299999999999999 0.23999999999999999 0.032000000000000001
 0.36399999999999999 0.51456690999999999 0.322580645 0.29591836700000002
 0.083000000000000004 0.42399999999999999 0.0030000000000000001
 0.40700000000000003 0.64231503700000003 0.99354838700000003
 0.24693877600000003 0.30299999999999999 0.23999999999999999
 0.032000000000000001 0.36399999999999999 0.51456690999999999 0.322580645
 0.29591836700000002 0.33700000000000002 0.313 0.032000000000000001
 0.36200000000000004 0.58326396700000005 0.43870967700000002
 0.35306122400000001 0.114 0.46200000000000002 0.0069999999999999993
 0.33899999999999997 0.80884297099999991 1.0 0.23265306100000002
 0.51000000000000001 0.313 0.065000000000000002 0.33000000000000002
 0.64691769099999996 0.174193548 0.11020408199999999 0.083000000000000004
 0.42399999999999999 0.0030000000000000001 0.40700000000000003
 0.64231503700000003 0.99354838700000003 0.24693877600000003
 0.20600000000000002 0.40500000000000003 0.027999999999999997
 0.37200000000000005 0.73059785499999996 0.83225806499999999
 0.13469387800000002 0.10300000000000001 0.23999999999999999 0.0
 0.40100000000000002 0.36757577200000002 0.35483871 0.0 0.21899999999999997
 0.53700000000000003 0.040000000000000001 0.39000000000000001 1.0
 0.98064516099999999 0.10000000000000001 0.45399999999999996 1.0 0.031
 0.36200000000000004 0.88718601599999991 0.76129032299999999 0.167346939
 0.60799999999999998 0.41600000000000004 0.068000000000000005
 0.35999999999999999 0.71561474800000002 0.29032258100000002
 0.18979591800000001 0.13800000000000001 0.46299999999999997
 0.0030000000000000001 0.40200000000000002 0.64231503700000003
 0.98064516099999999 0.14693877599999999 0.14099999999999999
 0.26200000000000001 0.0 0.40399999999999997 0.43627283 0.61935483899999999
 0.17551020399999998 0.33700000000000002 0.313 0.032000000000000001
 0.36200000000000004 0.58326396700000005 0.43870967700000002
 0.35306122400000001 0.30299999999999999 0.23999999999999999
 0.032000000000000001 0.36399999999999999 0.51456690999999999 0.322580645
 0.29591836700000002 0.094 0.36899999999999999 0.0030000000000000001
 0.39899999999999997 0.57361797999999997 0.84516128999999995
 0.26734693900000001 0.10300000000000001 0.23999999999999999 0.0
 0.40100000000000002 0.36757577200000002 0.35483871 0.0 0.51500000000000001
 0.60099999999999998 0.95599999999999996 0.0 0.65176516699999998 0.0
 0.38775510200000002 0.51000000000000001 0.313 0.065000000000000002
 0.33000000000000002 0.64691769099999996 0.174193548 0.11020408199999999
 0.083000000000000004 0.42399999999999999 0.0030000000000000001
 0.40700000000000003 0.64231503700000003 0.99354838700000003
 0.24693877600000003 0.51000000000000001 0.313 0.065000000000000002
 0.33000000000000002 0.64691769099999996 0.174193548 0.11020408199999999
 0.45399999999999996 1.0 0.031 0.36200000000000004 0.88718601599999991
 0.76129032299999999 0.167346939 0.14099999999999999 0.26200000000000001
 0.0 0.40399999999999997 0.43627283 0.61935483899999999 0.17551020399999998
 0.51500000000000001 0.60099999999999998 0.95599999999999996 0.0
 0.65176516699999998 0.0 0.38775510200000002 0.33700000000000002 0.313
 0.032000000000000001 0.36200000000000004 0.58326396700000005
 0.43870967700000002 0.35306122400000001 0.10300000000000001
 0.23999999999999999 0.0 0.40100000000000002 0.36757577200000002 0.35483871
 0.0 0.33700000000000002 0.313 0.032000000000000001 0.36200000000000004
 0.58326396700000005 0.43870967700000002 0.35306122400000001 0.094
 0.36899999999999999 0.0030000000000000001 0.39899999999999997
 0.57361797999999997 0.84516128999999995 0.26734693900000001
 0.10300000000000001 0.23999999999999999 0.0 0.40100000000000002
 0.36757577200000002 0.35483871 0.0 0.21899999999999997 0.53700000000000003
 0.040000000000000001 0.39000000000000001 1.0 0.98064516099999999
 0.10000000000000001 0.083000000000000004 0.42399999999999999
 0.0030000000000000001 0.40700000000000003 0.64231503700000003
 0.99354838700000003 0.24693877600000003 0.51000000000000001 0.313
 0.065000000000000002 0.33000000000000002 0.64691769099999996 0.174193548
 0.11020408199999999 0.10300000000000001 0.23999999999999999 0.0
 0.40100000000000002 0.36757577200000002 0.35483871 0.0 0.90500000000000003
 0.16899999999999998 1.0 1.0 0.85295989799999994 0.26451612899999999
 0.44285714299999995 0.41100000000000003 0.0 0.029999999999999999 0.442
 0.56372717000000006 0.058064515999999997 1.0 1.0 0.313 0.95200000000000007
 0.872 0.7158106059999999 0.20645161300000001 0.29999999999999999
 0.60799999999999998 0.41600000000000004 0.068000000000000005
 0.35999999999999999 0.71561474800000002 0.29032258100000002
 0.18979591800000001 0.45399999999999996 1.0 0.031 0.36200000000000004
 0.88718601599999991 0.76129032299999999 0.167346939 0.51500000000000001
 0.60099999999999998 0.95599999999999996 0.0 0.65176516699999998 0.0
 0.38775510200000002 0.45399999999999996 1.0 0.031 0.36200000000000004
 0.88718601599999991 0.76129032299999999 0.167346939]
====================================================================

Split Data for Training and Testing

In [24]:
from sklearn.model_selection import GridSearchCV, validation_curve, StratifiedShuffleSplit
from sklearn import metrics
from sklearn.metrics import classification_report
In [25]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) # n_splits=1 gives a single stratified train/test split
sss.get_n_splits(seq_feature, seq_label)
for train_index, test_index in sss.split(seq_feature, seq_label):
    features_train, features_test = seq_feature[train_index], seq_feature[test_index]
    labels_train, labels_test = seq_label[train_index], seq_label[test_index]    
    
print "Shape of Feature Train:", features_train.shape
print "Shape of Feature Test :", features_test.shape
print "# Label Train:", len(labels_train)
print "# Label Test :", len(labels_test)
Shape of Feature Train: (63586, 287)
Shape of Feature Test : (15897, 287)
# Label Train: 63586
# Label Test : 15897
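
A quick check (a sketch, not in the original run) that the stratified split preserved the class balance:

In [ ]:
# sketch: class ratios should match between train and test
print(np.bincount(labels_train) / float(len(labels_train)))
print(np.bincount(labels_test) / float(len(labels_test)))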

XGBoost Training - 1st Round

In [35]:
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost import plot_tree
In [36]:
xgbc = XGBClassifier(
    max_depth=10, ## default = 6, range: [1,∞], higher is more likely to overfit
    learning_rate =0.01, # decreasing this requires increasing n_estimators
    n_estimators=2000, # number of rounds of boosting (error correction); a higher value pairs with a lower learning_rate (eta)
    silent = True,
    objective= 'binary:logistic', # binary:logistic, multi:softmax, multi:softprob
    nthread=-1,
    gamma=0.5,  ## [0,∞]. Larger values make splits harder, controlling model complexity
    min_child_weight=6, ## [0,∞], default = 1, minimum sum of instance weight (hessian) needed in a child, the larger, the more conservative the algorithm will be.
    max_delta_step = 5, ## [0,∞], default = 0. Usually not needed, but it can help in logistic regression when classes are extremely imbalanced; a value of 1-10 may help control the update
    subsample=0.6, ## (default = 1), [0,1]; lowering this helps prevent overfitting
    colsample_bytree=0.8, ## [0,1], subsample ratio of columns when constructing each tree; control overfitting
    colsample_bylevel=1, # [0,1], subsample ratio of columns for each split, in each level
    reg_alpha=0, # default = 0, L1 regularization term on weights, increase this value will make model more conservative
    reg_lambda=1, # default = 1, L2 regularization term on weights, increase this value will make model more conservative.
    scale_pos_weight=5, ## default = 1, Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases)
    #base_score=0.5, 
    seed=0,
    missing=None)
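
As the scale_pos_weight comment suggests, a data-driven starting point can be computed from the training labels (a sketch):

In [ ]:
# sketch: ratio of negative to positive cases as a scale_pos_weight candidate
neg = np.sum(labels_train == 0)
pos = np.sum(labels_train == 1)
print(float(neg) / pos)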
In [37]:
# put the feature and label and weight into xgb.DMatrix for xgb.cv
dtrain = xgb.DMatrix(features_train, labels_train)
dtest = xgb.DMatrix(features_test, labels_test)

# get the hyperparamters and add 'num_class' for xgb.cv
param = xgbc.get_params()
#param['num_class'] = len(np.unique(labels_11_train_ds))
num_round = 500
xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed = 0, # mlogloss
       early_stopping_rounds=10, verbose_eval=True, as_pandas = True)
#callbacks=[xgb.callback.print_evaluation(show_stdv=False), xgb.callback.early_stop(3)])
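
With as_pandas=True, xgb.cv returns a DataFrame; a sketch of pulling out the best round, assuming xgboost's test-auc-mean column naming:

In [ ]:
# sketch: capture and inspect the cross-validation results
cv_results = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0,
                    early_stopping_rounds=10, as_pandas=True)
print(cv_results['test-auc-mean'].idxmax())  # best boosting round
print(cv_results['test-auc-mean'].max())     # best mean CV AUC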
In [38]:
# quick training without tuning hyper-parameters to see how good it is
t0 = time() 

xgbc.fit(features_train, labels_train, 
         eval_set = [(features_test, labels_test)], eval_metric = 'auc', 
         early_stopping_rounds=30) #eval_set = watchlist, early_stopping_rounds=10

print "training time:", round(time()-t0, 3), "s" # time the training process
score = xgbc.score(features_test, labels_test) # note: sklearn's .score() returns mean accuracy, not AUC
print "Accuracy: ", round(score, 2) # print out the score
    
print classification_report(labels_test, xgbc.predict(features_test))
[0]	validation_0-auc:0.923663
Will train until validation_0-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.927868
[2]	validation_0-auc:0.929454
[3]	validation_0-auc:0.932406
[4]	validation_0-auc:0.93383
[5]	validation_0-auc:0.93445
[6]	validation_0-auc:0.93455
[7]	validation_0-auc:0.934683
[8]	validation_0-auc:0.934894
[9]	validation_0-auc:0.935103
[10]	validation_0-auc:0.935118
[11]	validation_0-auc:0.935179
[12]	validation_0-auc:0.935518
[13]	validation_0-auc:0.935637
[14]	validation_0-auc:0.935751
[15]	validation_0-auc:0.935885
[16]	validation_0-auc:0.935878
[17]	validation_0-auc:0.935913
[18]	validation_0-auc:0.936076
[19]	validation_0-auc:0.936206
[20]	validation_0-auc:0.936361
[21]	validation_0-auc:0.936404
[22]	validation_0-auc:0.936467
[23]	validation_0-auc:0.936596
[24]	validation_0-auc:0.936774
[25]	validation_0-auc:0.936823
[26]	validation_0-auc:0.936788
[27]	validation_0-auc:0.936811
[28]	validation_0-auc:0.936901
[29]	validation_0-auc:0.936891
[30]	validation_0-auc:0.937015
[31]	validation_0-auc:0.936995
[32]	validation_0-auc:0.937086
[33]	validation_0-auc:0.937182
[34]	validation_0-auc:0.937186
[35]	validation_0-auc:0.93729
[36]	validation_0-auc:0.937273
[37]	validation_0-auc:0.937278
[38]	validation_0-auc:0.937291
[39]	validation_0-auc:0.937368
[40]	validation_0-auc:0.937405
[41]	validation_0-auc:0.937461
[42]	validation_0-auc:0.937446
[43]	validation_0-auc:0.93743
[44]	validation_0-auc:0.937473
[45]	validation_0-auc:0.937449
[46]	validation_0-auc:0.937469
[47]	validation_0-auc:0.937516
[48]	validation_0-auc:0.937532
[49]	validation_0-auc:0.937505
[50]	validation_0-auc:0.937476
[51]	validation_0-auc:0.937514
[52]	validation_0-auc:0.937497
[53]	validation_0-auc:0.937549
[54]	validation_0-auc:0.93752
[55]	validation_0-auc:0.937491
[56]	validation_0-auc:0.93746
[57]	validation_0-auc:0.937452
[58]	validation_0-auc:0.937444
[59]	validation_0-auc:0.937463
[60]	validation_0-auc:0.937476
[61]	validation_0-auc:0.937465
[62]	validation_0-auc:0.93745
[63]	validation_0-auc:0.937449
[64]	validation_0-auc:0.937455
[65]	validation_0-auc:0.937492
[66]	validation_0-auc:0.937477
[67]	validation_0-auc:0.937491
[68]	validation_0-auc:0.937562
[69]	validation_0-auc:0.937583
[70]	validation_0-auc:0.937606
[71]	validation_0-auc:0.937637
[72]	validation_0-auc:0.937676
[73]	validation_0-auc:0.937673
[74]	validation_0-auc:0.937644
[75]	validation_0-auc:0.937667
[76]	validation_0-auc:0.937724
[77]	validation_0-auc:0.937715
[78]	validation_0-auc:0.937728
[79]	validation_0-auc:0.937748
[80]	validation_0-auc:0.937783
[81]	validation_0-auc:0.93779
[82]	validation_0-auc:0.937768
[83]	validation_0-auc:0.937754
[84]	validation_0-auc:0.937751
[85]	validation_0-auc:0.937798
[86]	validation_0-auc:0.93786
[87]	validation_0-auc:0.937918
[88]	validation_0-auc:0.937922
[89]	validation_0-auc:0.93791
[90]	validation_0-auc:0.937915
[91]	validation_0-auc:0.93795
[92]	validation_0-auc:0.937996
[93]	validation_0-auc:0.938022
[94]	validation_0-auc:0.938007
[95]	validation_0-auc:0.938004
[96]	validation_0-auc:0.938023
[97]	validation_0-auc:0.938042
[98]	validation_0-auc:0.938038
[99]	validation_0-auc:0.938031
[100]	validation_0-auc:0.938081
[101]	validation_0-auc:0.93809
[102]	validation_0-auc:0.938167
[103]	validation_0-auc:0.938141
[104]	validation_0-auc:0.938139
[105]	validation_0-auc:0.938119
[106]	validation_0-auc:0.938126
[107]	validation_0-auc:0.938154
[108]	validation_0-auc:0.938158
[109]	validation_0-auc:0.93817
[110]	validation_0-auc:0.938196
[111]	validation_0-auc:0.938262
[112]	validation_0-auc:0.938293
[113]	validation_0-auc:0.938305
[114]	validation_0-auc:0.938369
[115]	validation_0-auc:0.938367
[116]	validation_0-auc:0.938367
[117]	validation_0-auc:0.938401
[118]	validation_0-auc:0.938429
[119]	validation_0-auc:0.938419
[120]	validation_0-auc:0.938437
[121]	validation_0-auc:0.938443
[122]	validation_0-auc:0.938456
[123]	validation_0-auc:0.938473
[124]	validation_0-auc:0.938479
[125]	validation_0-auc:0.938491
[126]	validation_0-auc:0.938482
[127]	validation_0-auc:0.93846
[128]	validation_0-auc:0.938495
[129]	validation_0-auc:0.938488
[130]	validation_0-auc:0.9385
[131]	validation_0-auc:0.93853
[132]	validation_0-auc:0.938552
[133]	validation_0-auc:0.938585
[134]	validation_0-auc:0.938607
[135]	validation_0-auc:0.93864
[136]	validation_0-auc:0.938645
[137]	validation_0-auc:0.938668
[138]	validation_0-auc:0.938665
[139]	validation_0-auc:0.93868
[140]	validation_0-auc:0.938708
[141]	validation_0-auc:0.938718
[142]	validation_0-auc:0.938734
[143]	validation_0-auc:0.938777
[144]	validation_0-auc:0.938764
[145]	validation_0-auc:0.938827
[146]	validation_0-auc:0.938803
[147]	validation_0-auc:0.93881
[148]	validation_0-auc:0.93885
[149]	validation_0-auc:0.938872
[150]	validation_0-auc:0.938908
[151]	validation_0-auc:0.938914
[152]	validation_0-auc:0.938941
[153]	validation_0-auc:0.93895
[154]	validation_0-auc:0.93895
[155]	validation_0-auc:0.938968
[156]	validation_0-auc:0.938952
[157]	validation_0-auc:0.938966
[158]	validation_0-auc:0.938982
[159]	validation_0-auc:0.938996
[160]	validation_0-auc:0.938984
[161]	validation_0-auc:0.938987
[162]	validation_0-auc:0.938988
[163]	validation_0-auc:0.939016
[164]	validation_0-auc:0.939026
[165]	validation_0-auc:0.93904
[166]	validation_0-auc:0.939057
[167]	validation_0-auc:0.939098
[168]	validation_0-auc:0.939114
[169]	validation_0-auc:0.939114
[170]	validation_0-auc:0.939115
[171]	validation_0-auc:0.939083
[172]	validation_0-auc:0.939096
[173]	validation_0-auc:0.93909
[174]	validation_0-auc:0.939081
[175]	validation_0-auc:0.939083
[176]	validation_0-auc:0.939079
[177]	validation_0-auc:0.939079
[178]	validation_0-auc:0.939088
[179]	validation_0-auc:0.939092
[180]	validation_0-auc:0.939095
[181]	validation_0-auc:0.939115
[182]	validation_0-auc:0.939136
[183]	validation_0-auc:0.939136
[184]	validation_0-auc:0.939124
[185]	validation_0-auc:0.939113
[186]	validation_0-auc:0.939116
[187]	validation_0-auc:0.939109
[188]	validation_0-auc:0.939096
[189]	validation_0-auc:0.939095
[190]	validation_0-auc:0.939118
[191]	validation_0-auc:0.939127
[192]	validation_0-auc:0.939139
[193]	validation_0-auc:0.939155
[194]	validation_0-auc:0.939154
[195]	validation_0-auc:0.939147
[196]	validation_0-auc:0.939153
[197]	validation_0-auc:0.939153
[198]	validation_0-auc:0.939135
[199]	validation_0-auc:0.939135
[200]	validation_0-auc:0.939139
[201]	validation_0-auc:0.939151
[202]	validation_0-auc:0.939155
[203]	validation_0-auc:0.939146
[204]	validation_0-auc:0.939145
[205]	validation_0-auc:0.939148
[206]	validation_0-auc:0.93914
[207]	validation_0-auc:0.939136
[208]	validation_0-auc:0.939147
[209]	validation_0-auc:0.939137
[210]	validation_0-auc:0.93914
[211]	validation_0-auc:0.939146
[212]	validation_0-auc:0.939136
[213]	validation_0-auc:0.939133
[214]	validation_0-auc:0.939133
[215]	validation_0-auc:0.939147
[216]	validation_0-auc:0.939147
[217]	validation_0-auc:0.939144
[218]	validation_0-auc:0.939154
[219]	validation_0-auc:0.939147
[220]	validation_0-auc:0.939156
[221]	validation_0-auc:0.939138
[222]	validation_0-auc:0.939149
[223]	validation_0-auc:0.939158
[224]	validation_0-auc:0.939163
[225]	validation_0-auc:0.939163
[226]	validation_0-auc:0.939181
[227]	validation_0-auc:0.939187
[228]	validation_0-auc:0.9392
[229]	validation_0-auc:0.939207
[230]	validation_0-auc:0.939216
[231]	validation_0-auc:0.939213
[232]	validation_0-auc:0.939218
[233]	validation_0-auc:0.939222
[234]	validation_0-auc:0.939232
[235]	validation_0-auc:0.939223
[236]	validation_0-auc:0.939217
[237]	validation_0-auc:0.939219
[238]	validation_0-auc:0.9392
[239]	validation_0-auc:0.939211
[240]	validation_0-auc:0.939212
[241]	validation_0-auc:0.9392
[242]	validation_0-auc:0.939197
[243]	validation_0-auc:0.939211
[244]	validation_0-auc:0.939213
[245]	validation_0-auc:0.939211
[246]	validation_0-auc:0.939221
[247]	validation_0-auc:0.939209
[248]	validation_0-auc:0.939207
[249]	validation_0-auc:0.939218
[250]	validation_0-auc:0.939217
[251]	validation_0-auc:0.939236
[252]	validation_0-auc:0.939246
[253]	validation_0-auc:0.939244
[254]	validation_0-auc:0.939242
[255]	validation_0-auc:0.939254
[256]	validation_0-auc:0.939257
[257]	validation_0-auc:0.93926
[258]	validation_0-auc:0.939265
[259]	validation_0-auc:0.939283
[260]	validation_0-auc:0.939285
[261]	validation_0-auc:0.939297
[262]	validation_0-auc:0.939304
[263]	validation_0-auc:0.939305
[264]	validation_0-auc:0.939313
[265]	validation_0-auc:0.939312
[266]	validation_0-auc:0.939311
[267]	validation_0-auc:0.939298
[268]	validation_0-auc:0.939313
[269]	validation_0-auc:0.939321
[270]	validation_0-auc:0.939323
[271]	validation_0-auc:0.939324
[272]	validation_0-auc:0.939323
[273]	validation_0-auc:0.939305
[274]	validation_0-auc:0.939321
[275]	validation_0-auc:0.939314
[276]	validation_0-auc:0.939326
[277]	validation_0-auc:0.939327
[278]	validation_0-auc:0.93933
[279]	validation_0-auc:0.939342
[280]	validation_0-auc:0.939344
[281]	validation_0-auc:0.939339
[282]	validation_0-auc:0.93935
[283]	validation_0-auc:0.939341
[284]	validation_0-auc:0.939357
[285]	validation_0-auc:0.939344
[286]	validation_0-auc:0.939349
[287]	validation_0-auc:0.939359
[288]	validation_0-auc:0.93936
[289]	validation_0-auc:0.939369
[290]	validation_0-auc:0.93937
[291]	validation_0-auc:0.939369
[292]	validation_0-auc:0.939362
[293]	validation_0-auc:0.93937
[294]	validation_0-auc:0.939363
[295]	validation_0-auc:0.939373
[296]	validation_0-auc:0.939395
[297]	validation_0-auc:0.939399
[298]	validation_0-auc:0.939402
[299]	validation_0-auc:0.9394
[300]	validation_0-auc:0.93941
[301]	validation_0-auc:0.939403
[302]	validation_0-auc:0.939422
[303]	validation_0-auc:0.939415
[304]	validation_0-auc:0.93942
[305]	validation_0-auc:0.939423
[306]	validation_0-auc:0.939445
[307]	validation_0-auc:0.939441
[308]	validation_0-auc:0.939449
[309]	validation_0-auc:0.939446
[310]	validation_0-auc:0.939433
[311]	validation_0-auc:0.939446
[312]	validation_0-auc:0.939453
[313]	validation_0-auc:0.939449
[314]	validation_0-auc:0.939467
[315]	validation_0-auc:0.939481
[316]	validation_0-auc:0.939494
[317]	validation_0-auc:0.939492
[318]	validation_0-auc:0.93949
[319]	validation_0-auc:0.939509
[320]	validation_0-auc:0.939515
[321]	validation_0-auc:0.939512
[322]	validation_0-auc:0.939526
[323]	validation_0-auc:0.939536
[324]	validation_0-auc:0.939537
[325]	validation_0-auc:0.93953
[326]	validation_0-auc:0.939526
[327]	validation_0-auc:0.939532
[328]	validation_0-auc:0.939535
[329]	validation_0-auc:0.939557
[330]	validation_0-auc:0.939567
[331]	validation_0-auc:0.939558
[332]	validation_0-auc:0.939566
[333]	validation_0-auc:0.939565
[334]	validation_0-auc:0.939584
[335]	validation_0-auc:0.93958
[336]	validation_0-auc:0.939578
[337]	validation_0-auc:0.939573
[338]	validation_0-auc:0.93957
[339]	validation_0-auc:0.939557
[340]	validation_0-auc:0.939575
[341]	validation_0-auc:0.93958
[342]	validation_0-auc:0.939589
[343]	validation_0-auc:0.939593
[344]	validation_0-auc:0.939595
[345]	validation_0-auc:0.939604
[346]	validation_0-auc:0.939612
[347]	validation_0-auc:0.939606
[348]	validation_0-auc:0.939613
[349]	validation_0-auc:0.939613
[350]	validation_0-auc:0.939613
[351]	validation_0-auc:0.939624
[352]	validation_0-auc:0.939613
[353]	validation_0-auc:0.939626
[354]	validation_0-auc:0.939633
[355]	validation_0-auc:0.939636
[356]	validation_0-auc:0.939626
[357]	validation_0-auc:0.939618
[358]	validation_0-auc:0.939628
[359]	validation_0-auc:0.939638
[360]	validation_0-auc:0.939636
[361]	validation_0-auc:0.939653
[362]	validation_0-auc:0.939666
[363]	validation_0-auc:0.939678
[364]	validation_0-auc:0.939689
[365]	validation_0-auc:0.939695
[366]	validation_0-auc:0.939698
[367]	validation_0-auc:0.939681
[368]	validation_0-auc:0.93968
[369]	validation_0-auc:0.939688
[370]	validation_0-auc:0.939695
[371]	validation_0-auc:0.939704
[372]	validation_0-auc:0.939708
[373]	validation_0-auc:0.939708
[374]	validation_0-auc:0.939712
[375]	validation_0-auc:0.93972
[376]	validation_0-auc:0.939721
[377]	validation_0-auc:0.93972
[378]	validation_0-auc:0.939723
[379]	validation_0-auc:0.939722
[380]	validation_0-auc:0.939734
[381]	validation_0-auc:0.939733
[382]	validation_0-auc:0.93971
[383]	validation_0-auc:0.939689
[384]	validation_0-auc:0.939696
[385]	validation_0-auc:0.939695
[386]	validation_0-auc:0.939698
[387]	validation_0-auc:0.939716
[388]	validation_0-auc:0.939718
[389]	validation_0-auc:0.939709
[390]	validation_0-auc:0.939714
[391]	validation_0-auc:0.93972
[392]	validation_0-auc:0.939737
[393]	validation_0-auc:0.939732
[394]	validation_0-auc:0.939741
[395]	validation_0-auc:0.939746
[396]	validation_0-auc:0.939751
[397]	validation_0-auc:0.939749
[398]	validation_0-auc:0.939792
[399]	validation_0-auc:0.939804
[400]	validation_0-auc:0.939803
[401]	validation_0-auc:0.939811
[402]	validation_0-auc:0.939814
[403]	validation_0-auc:0.939804
[404]	validation_0-auc:0.939802
[405]	validation_0-auc:0.939807
[406]	validation_0-auc:0.93982
[407]	validation_0-auc:0.939816
[408]	validation_0-auc:0.939809
[409]	validation_0-auc:0.9398
[410]	validation_0-auc:0.939806
[411]	validation_0-auc:0.939805
[412]	validation_0-auc:0.939797
[413]	validation_0-auc:0.939792
[414]	validation_0-auc:0.939791
[415]	validation_0-auc:0.939789
[416]	validation_0-auc:0.939818
[417]	validation_0-auc:0.939819
[418]	validation_0-auc:0.93983
[419]	validation_0-auc:0.939833
[420]	validation_0-auc:0.939842
[421]	validation_0-auc:0.939855
[422]	validation_0-auc:0.939857
[423]	validation_0-auc:0.939854
[424]	validation_0-auc:0.939866
[425]	validation_0-auc:0.939878
[426]	validation_0-auc:0.939871
[427]	validation_0-auc:0.939868
[428]	validation_0-auc:0.939871
[429]	validation_0-auc:0.939856
[430]	validation_0-auc:0.939855
[431]	validation_0-auc:0.93986
[432]	validation_0-auc:0.939869
[433]	validation_0-auc:0.939878
[434]	validation_0-auc:0.939865
[435]	validation_0-auc:0.939857
[436]	validation_0-auc:0.939877
[437]	validation_0-auc:0.939869
[438]	validation_0-auc:0.939875
[439]	validation_0-auc:0.939871
[440]	validation_0-auc:0.93987
[441]	validation_0-auc:0.939875
[442]	validation_0-auc:0.939866
[443]	validation_0-auc:0.939879
[444]	validation_0-auc:0.939872
[445]	validation_0-auc:0.939867
[446]	validation_0-auc:0.939875
[447]	validation_0-auc:0.939864
[448]	validation_0-auc:0.939879
[449]	validation_0-auc:0.939888
[450]	validation_0-auc:0.939893
[451]	validation_0-auc:0.939888
[452]	validation_0-auc:0.939888
[453]	validation_0-auc:0.939892
[454]	validation_0-auc:0.939894
[455]	validation_0-auc:0.9399
[456]	validation_0-auc:0.939899
[457]	validation_0-auc:0.939889
[458]	validation_0-auc:0.939903
[459]	validation_0-auc:0.939915
[460]	validation_0-auc:0.939913
[461]	validation_0-auc:0.939927
[462]	validation_0-auc:0.939934
[463]	validation_0-auc:0.939943
[464]	validation_0-auc:0.939955
[465]	validation_0-auc:0.939958
[466]	validation_0-auc:0.939973
[467]	validation_0-auc:0.939955
[468]	validation_0-auc:0.939961
[469]	validation_0-auc:0.939963
[470]	validation_0-auc:0.93996
[471]	validation_0-auc:0.939954
[472]	validation_0-auc:0.939953
[473]	validation_0-auc:0.93996
[474]	validation_0-auc:0.939973
[475]	validation_0-auc:0.939973
[476]	validation_0-auc:0.939966
[477]	validation_0-auc:0.939955
[478]	validation_0-auc:0.93995
[479]	validation_0-auc:0.939956
[480]	validation_0-auc:0.939946
[481]	validation_0-auc:0.939945
[482]	validation_0-auc:0.939939
[483]	validation_0-auc:0.939939
[484]	validation_0-auc:0.939938
[485]	validation_0-auc:0.939932
[486]	validation_0-auc:0.939943
[487]	validation_0-auc:0.939984
[488]	validation_0-auc:0.939996
[489]	validation_0-auc:0.939994
[490]	validation_0-auc:0.939983
[491]	validation_0-auc:0.939979
[492]	validation_0-auc:0.93996
[493]	validation_0-auc:0.939967
[494]	validation_0-auc:0.939952
[495]	validation_0-auc:0.939967
[496]	validation_0-auc:0.939975
[497]	validation_0-auc:0.939975
[498]	validation_0-auc:0.94001
[499]	validation_0-auc:0.940014
[500]	validation_0-auc:0.94002
[501]	validation_0-auc:0.940017
[502]	validation_0-auc:0.940018
[503]	validation_0-auc:0.940022
[504]	validation_0-auc:0.940018
[505]	validation_0-auc:0.940028
[506]	validation_0-auc:0.940031
[507]	validation_0-auc:0.940037
[508]	validation_0-auc:0.940056
[509]	validation_0-auc:0.940036
[510]	validation_0-auc:0.940029
[511]	validation_0-auc:0.940032
[512]	validation_0-auc:0.940027
[513]	validation_0-auc:0.940037
[514]	validation_0-auc:0.940022
[515]	validation_0-auc:0.940029
[516]	validation_0-auc:0.940037
[517]	validation_0-auc:0.940039
[518]	validation_0-auc:0.940026
[519]	validation_0-auc:0.940029
[520]	validation_0-auc:0.940019
[521]	validation_0-auc:0.940022
[522]	validation_0-auc:0.94003
[523]	validation_0-auc:0.940045
[524]	validation_0-auc:0.94004
[525]	validation_0-auc:0.940037
[526]	validation_0-auc:0.940034
[527]	validation_0-auc:0.939997
[528]	validation_0-auc:0.940001
[529]	validation_0-auc:0.939999
[530]	validation_0-auc:0.940004
[531]	validation_0-auc:0.940011
[532]	validation_0-auc:0.940002
[533]	validation_0-auc:0.940015
[534]	validation_0-auc:0.940016
[535]	validation_0-auc:0.940009
[536]	validation_0-auc:0.940022
[537]	validation_0-auc:0.940025
[538]	validation_0-auc:0.940046
Stopping. Best iteration:
[508]	validation_0-auc:0.940056

training time: 339.848 s
Accuracy:  0.83
             precision    recall  f1-score   support

          0       0.89      0.72      0.80      7349
          1       0.79      0.92      0.85      8548

avg / total       0.84      0.83      0.83     15897
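
Since .score() reports accuracy, the test-set ROC AUC itself can be computed from the predicted probabilities (a sketch using the metrics module imported earlier):

In [ ]:
# sketch: ROC AUC from the positive-class probabilities
probs = xgbc.predict_proba(features_test)[:, 1]  # P(label == 1)
print(metrics.roc_auc_score(labels_test, probs))  # should sit near the eval log (~0.94)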

In [ ]:
# save the trained classifier
from sklearn.externals import joblib
filepath = '~'
filename = '20170425_IPns.pkl'
joblib.dump(xgbc, filepath+filename)
# xgbc_load = joblib.load('filename.pkl')

Visualize Important Features that Differentiate Good vs. Bad Sequences

  • Hydrophobicity @ position 4
  • Rigidity @ position 4
  • Surface area @ positions 6 & 20
  • Molecular weight @ position 20
In [40]:
# print "important features are: \n", xgbc.feature_importances_.reshape([num_aa, num_chemprop])
import seaborn as sns  # needed below for the heatmap

imp_features = xgbc.feature_importances_.reshape([num_aa, num_chemprop])
imp_features = pd.DataFrame(imp_features).T
imp_features.index = aaindex_7.columns.tolist()

plt.rcParams['figure.figsize'] = (17, 3)
fig, ax1 = plt.subplots(nrows=1, ncols=1)
fig.suptitle('Importance of AA Positions and Chemical Properties')
sns.heatmap(imp_features); ax1.set_xlabel('amino acid position')
Out[40]:
<matplotlib.text.Text at 0x121c8ef10>
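
The same hotspots can be read off programmatically; a sketch that ranks the flattened features and maps each index back to its (position, property) pair:

In [ ]:
# sketch: top-5 features mapped back to amino acid position and chemical property
importances = xgbc.feature_importances_
for f in np.argsort(importances)[::-1][:5]:
    pos, prop = divmod(f, num_chemprop)  # flat index = pos*num_chemprop + prop
    print('position %d, %s: %.4f' % (pos + 1, aaindex_7.columns[prop], importances[f]))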
In [41]:
# labels_test = np.array(labels_test)
idx_test_0 = labels_test == 0
print "True Label - Good  :", labels_test[idx_test_0][:10]
print "The prediction says:", xgbc.predict(features_test)[idx_test_0][:10]
print "The prediction prob:", xgbc.predict_proba(features_test)[idx_test_0][:10]
print ""
idx_test_1 = labels_test == 1
print "True Label - Bad   :", labels_test[idx_test_1][:10]
print "The prediction says:", xgbc.predict(features_test)[idx_test_1][:10]
print "The prediction prob:", xgbc.predict_proba(features_test)[idx_test_1][:10]
True Label - Good  : [0 0 0 0 0 0 0 0 0 0]
The prediction says: [1 0 0 0 1 0 1 0 1 1]
The prediction prob: [[ 0.35240304  0.64759696]
 [ 0.73139787  0.26860216]
 [ 0.72306132  0.27693865]
 [ 0.64800191  0.35199806]
 [ 0.0461176   0.9538824 ]
 [ 0.69444454  0.30555546]
 [ 0.45232183  0.54767817]
 [ 0.80561113  0.19438888]
 [ 0.39005452  0.60994548]
 [ 0.3764984   0.6235016 ]]

True Label - Bad   : [1 1 1 1 1 1 1 1 1 1]
The prediction says: [1 1 0 1 1 1 1 1 1 1]
The prediction prob: [[ 0.00458264  0.99541736]
 [ 0.00904483  0.99095517]
 [ 0.6709379   0.32906213]
 [ 0.0029332   0.9970668 ]
 [ 0.00294471  0.99705529]
 [ 0.00292981  0.99707019]
 [ 0.35621434  0.64378566]
 [ 0.00786704  0.99213296]
 [ 0.01465297  0.98534703]
 [ 0.00292981  0.99707019]]

Visualize the Decision Trees

In [110]:
# need to install graphviz @ terminal: brew install graphviz
#pdf_file = '~/xgbc_trees.pdf'
#plot_tree(xgbc, num_trees=0)
#plt.show()

gvfile = '~/xgbc_trees.gv'
dot_data = xgb.to_graphviz(xgbc, num_trees=0)
dot_data.render(gvfile, view=False)
dot_data
Out[110]:
[Out[110] displayed a Graphviz rendering of the first boosted tree (num_trees=0): a binary decision tree whose internal nodes split the flattened features at learned thresholds (e.g. f178 < 0.870073, with "yes, missing" / "no" branches) and whose leaves hold the boosting weights. The raw node-label text of the figure is omitted here.]

Tuning Hyperparameters via GridSearchCV

  1. Number of trees & learning rate
  2. Across all trees
    • subsampling rows
    • different loss function
  3. Per tree
    • max depth
    • min sample to split a node
    • min sample in a leaf node
    • subsampling features
In [ ]:
# GridSearch for n_estimators and learning_rate first
cv_sets = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
parameters = {'n_estimators':[1000, 2000], 'learning_rate':[0.01, 0.05]}

grid = GridSearchCV(xgbc, parameters, cv=cv_sets, scoring = 'roc_auc', n_jobs=3) # 'f1' scoring
# grid.fit(features_wCID_11_train_ds, labels_11_train_ds)
grid.fit(features_train, labels_train)
print("The best parameters are %s with AUC of %0.2f" % (grid.best_params_, grid.best_score_))
print grid.best_estimator_
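
Once the learning rate and number of trees are fixed, a second-stage grid over the per-tree knobs listed above might look like this (a sketch; the values are illustrative):

In [ ]:
# sketch: follow-up grid over per-tree parameters, reusing the tuned estimator
tree_params = {'max_depth': [6, 8, 10],
               'min_child_weight': [2, 6, 10],
               'subsample': [0.6, 0.8],
               'colsample_bytree': [0.6, 0.8]}
grid2 = GridSearchCV(grid.best_estimator_, tree_params, cv=cv_sets, scoring='roc_auc', n_jobs=3)
grid2.fit(features_train, labels_train)
print(grid2.best_params_)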